import create_container_structures
import Department
import query
import sort_tokens_by_department
import tokenizer
import generate_pie_plots_per_dept
import heatmap
import numpy as np
import re
import nltk
from graphviz import Digraph
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
import networkx as nx
class Department():
    """Container for one department's CAPE reviews and course descriptions.

    Attributes:
        name: department code passed to the constructor.
        courses: 2-D string array filled by ``loadCourses``, one row per CAPE
            review (column layout is documented on ``loadCourses``).
        descriptions: ``(n, 3)`` object array filled by ``loadDescription``:
            course id, course title, array of lower-cased description tokens.
        preq: dict, initialised empty; this class never fills it in.
        professors: distinct values of ``courses[:, 2]``, set by ``cleanData``.
        evals: 2-D string array holding input columns 3 and 4 of the CAPE
            file, set by ``loadCourses``.
        gpa_median, time_median: results cached by ``Median_GPA`` and
            ``Median_Time``.
    """

    def __init__(self, name):
        self.name = name
        self.courses, self.descriptions = None, None
        self.preq, self.professors = {}, None
        self.evals = None
        self.gpa_median, self.time_median = None, None

    def stringPercentToFloat(self, percent):
        """Convert a percentage string such as '93.6%' to the fraction 0.936."""
        return round(float(percent[:-1]) / 100, 3)

    @staticmethod
    def _gradeToPoints(cell):
        """Return the number inside the parentheses of ``cell``, or 0.0 for 'N/A'."""
        if cell == 'N/A':
            return 0.0
        return float(re.findall(r'[^()]+', cell)[1])

    def loadCourses(self, file):
        """Load the tab-separated CAPE file into ``self.courses`` / ``self.evals``.

        Each row of ``self.courses`` holds, in order:
          0-1  course id and course title (input column 1 split on '-'),
          2    input column 0 (used elsewhere in this file as the professor),
          3    input column 2 (used elsewhere in this file as the quarter),
          4-5  input columns 5 and 6 converted from 'NN.N%' to fractions,
          6    input column 7 as a float (used elsewhere as time spent),
          7    number in parentheses from the second-to-last input column
               (used elsewhere as the expected GPA),
          8    number in parentheses from the last input column
               (used elsewhere as the received GPA).
        'N/A' grade cells are stored as 0.0.  Because the rows mix text and
        numbers, the resulting array has a string dtype.
        """
        # ndmin=2 keeps a one-review file two-dimensional; without it the loop
        # below would iterate over the columns of that single row.
        s = np.loadtxt(file, dtype='str', delimiter="\t", ndmin=2)
        res, evals = [], []
        for record in s:
            course = record[1].split('-')
            res.append([
                course[0].strip(), course[1].strip(),
                record[0], record[2],
                self.stringPercentToFloat(record[5]),
                self.stringPercentToFloat(record[6]),
                float(record[7]),
                self._gradeToPoints(record[-2]),
                self._gradeToPoints(record[-1]),
            ])
            evals.append([record[3], record[4]])
        self.courses = np.array(res)
        self.evals = np.array(evals)

    def loadDescription(self, filename):
        """Load course descriptions from ``filename`` into ``self.descriptions``.

        Blank lines are skipped.  The remaining lines are read in pairs: a
        header line whose first two '.'-separated fields are the course id and
        title, followed by a description line that is lower-cased and
        tokenised with ``nltk.word_tokenize``.  A trailing unpaired line is
        ignored.
        """
        with open(filename, 'r', errors='ignore') as file:
            lines = [line for line in file if line != '\n']
        pair_count = len(lines) // 2
        # The token arrays differ in length, so the table must be an explicit
        # object array; recent NumPy refuses to infer one from ragged input.
        table = np.empty((pair_count, 3), dtype=object)
        for i in range(pair_count):
            header = lines[2 * i].split('.')
            table[i, 0] = header[0].strip()
            table[i, 1] = header[1].strip()
            table[i, 2] = np.array(nltk.word_tokenize(lines[2 * i + 1].lower().strip()))
        self.descriptions = table

    def Median_GPA(self):
        """Return (and cache) the median of the non-zero values in the last column.

        Rows whose last column is '0.0' (no received GPA) are excluded.
        """
        grades = self.courses[:, -1]
        target = grades[grades != '0.0'].astype(float)
        self.gpa_median = np.median(target)
        return self.gpa_median

    def Median_Time(self):
        """Return (and cache) the median of the non-zero values in column 6."""
        hours = self.courses[:, 6]
        target = hours[hours != '0.0'].astype(float)
        self.time_median = np.median(target)
        return self.time_median

    def cleanData(self):
        """Drop reviews without a course description and impute missing GPAs.

        Reviews whose course id does not appear in ``self.descriptions`` are
        discarded, ``self.professors`` is rebuilt from the remaining rows, both
        medians are refreshed, and every '0.0' received GPA is replaced by the
        department median.
        """
        known_ids = np.array(list(set(self.descriptions[:, 0])))
        self.courses = self.courses[np.isin(self.courses[:, 0], known_ids)]
        self.professors = np.array(list(set(self.courses[:, 2])))
        # Record the missing rows before imputing so the median itself is
        # computed from genuine GPAs only.
        missing = self.courses[:, -1] == '0.0'
        median = self.Median_GPA()
        self.Median_Time()
        self.courses[missing, -1] = str(median)
# Build one fully loaded and cleaned Department per entry, in the order listed:
# (department code, CAPE review file, course description file).
_DEPARTMENT_SOURCES = (
    ('ECE', 'ECE_CAPE.txt', 'ECE_Description.txt'),
    ('CSE', 'CSE_CAPE.txt', 'CSE_Description.txt'),
    ('MATH', 'MATH_CAPE.txt', 'MATH_Description.txt'),
    ('COGS', 'COGS_CAPE.txt', 'COGS_Description.txt'),
)


def _load_department(code, cape_file, description_file):
    """Create a Department, load both of its data files and clean it."""
    loaded = Department(code)
    loaded.loadCourses(cape_file)
    loaded.loadDescription(description_file)
    loaded.cleanData()
    return loaded


ece, cse, math, cogs = [_load_department(*source) for source in _DEPARTMENT_SOURCES]
def avg_gpa(gpa):
    """Return the mean of the non-zero entries of ``gpa``, rounded to 3 decimals.

    Given a set of CAPE review values, compute their average while ignoring
    zeros, which this file uses as the "no data" marker.

    Args:
        gpa: array-like of numbers or numeric strings.

    Returns:
        The rounded mean of the non-zero entries, or 0.0 when there are none
        (including when ``gpa`` is empty).
    """
    # The builtin float replaces the np.float alias removed from NumPy.
    values = np.asarray(gpa).astype(float)
    nonzero = values != 0
    # Divide by at least 1 so an all-zero / empty input yields 0.0, not NaN.
    count = max(int(np.count_nonzero(nonzero)), 1)
    return round(np.sum(values, where=nonzero) / count, 3)
def GPA_by_department(department):
    """Return ``(average actual GPA, average expected GPA)`` for a department.

    The actual GPA is read from the last column of ``department.courses`` and
    the expected GPA from the second-to-last column; zeros are ignored by
    ``avg_gpa``.
    """
    # The builtin float replaces the np.float alias removed from NumPy.
    actual_gpa = department.courses[:, -1].astype(float)
    expected_gpa = department.courses[:, -2].astype(float)
    return avg_gpa(actual_gpa), avg_gpa(expected_gpa)
def GPA_by_professor(department):
    """Rank a department's professors by their average actual GPA.

    Returns:
        A string array with one row per entry of ``department.professors``:
        ``[professor, average actual GPA, average expected GPA]``, sorted by
        average actual GPA in descending order.  An empty ``(0, 3)`` array is
        returned when the department has no professors.
    """
    courses = department.courses
    rows = []
    for professor in department.professors:
        reviews = courses[courses[:, 2] == professor]
        rows.append([professor, avg_gpa(reviews[:, -1]), avg_gpa(reviews[:, -2])])
    if not rows:
        return np.empty((0, 3), dtype=str)
    table = np.array(rows)
    # The table is a string array, so sort on a numeric copy of the column.
    order = np.argsort(table[:, 1].astype(float))[::-1]
    return table[order]
def GPA_best_course(department):
    """Rank a department's courses by their average actual GPA.

    Returns:
        A string array with one row per distinct course id:
        ``[course id, average actual GPA, average expected GPA]``, sorted by
        average actual GPA in descending order.  An empty ``(0, 3)`` array is
        returned when there are no courses.
    """
    courses = department.courses
    rows = []
    for course in set(courses[:, 0]):
        reviews = courses[courses[:, 0] == course]
        rows.append([course, avg_gpa(reviews[:, -1]), avg_gpa(reviews[:, -2])])
    if not rows:
        return np.empty((0, 3), dtype=str)
    table = np.array(rows)
    # The table is a string array, so sort on a numeric copy of the column.
    order = np.argsort(table[:, 1].astype(float))[::-1]
    return table[order]
def GPA_best_time(department):
    """Rank a department's courses by their average time spent.

    Returns:
        A string array with one row per distinct course id:
        ``[course id, average of column 6]``, sorted by that average in
        descending order (most time first).  An empty ``(0, 2)`` array is
        returned when there are no courses.
    """
    courses = department.courses
    rows = []
    for course in set(courses[:, 0]):
        reviews = courses[courses[:, 0] == course]
        # avg_gpa is a generic "mean of the non-zero entries" helper.
        rows.append([course, avg_gpa(reviews[:, 6])])
    if not rows:
        return np.empty((0, 2), dtype=str)
    table = np.array(rows)
    # The table is a string array, so sort on a numeric copy of the column.
    order = np.argsort(table[:, 1].astype(float))[::-1]
    return table[order]
def _gpa_bar_figure(labels, actual, expected, layout=None):
    """Build a grouped bar chart comparing actual and expected GPA per label."""
    labels = list(labels)
    figure = go.Figure(data=[
        go.Bar(name='Actual GPA', x=labels, y=list(actual),
               marker_color=['skyblue'] * len(labels), opacity=0.5),
        go.Bar(name='Expected GPA', x=labels, y=list(expected),
               marker_color=['blue'] * len(labels), opacity=0.5),
    ], layout=layout)
    figure.update_layout(barmode='group')
    return figure


# --- Average actual vs. expected GPA of each department ---
names = ['ECE', 'CSE', 'COGS', 'MATH']
departments = [ece, cse, cogs, math]
actual_gpa_department, expected_gpa_department = [], []
colors = ['lightslategray,gray']  # not used by any chart below
layout = go.Layout(yaxis=dict(range=[2.5, 4]))
for item in departments:
    res = GPA_by_department(item)
    actual_gpa_department.append(res[0])
    expected_gpa_department.append(res[1])
fig = _gpa_bar_figure(names, actual_gpa_department, expected_gpa_department, layout)
fig.show()

# --- Five ECE professors with the highest average actual GPA ---
temp = GPA_by_professor(ece)
actual_gpa_prof = list(temp[:5, 1])
expected_gpa_prof = list(temp[:5, 2])
fig = _gpa_bar_figure(temp[:5, 0], actual_gpa_prof, expected_gpa_prof)
fig.show()

# --- Five ECE professors from the low end of the same ranking ---
# The ranking computed above is reused instead of being recomputed.
# NOTE(review): the slice skips the last 14 rows, presumably professors with
# no usable GPA data -- confirm the offsets against the data set.
actual_gpa_prof = list(temp[-19:-14, 1])
expected_gpa_prof = list(temp[-19:-14, 2])
layout = go.Layout(yaxis=dict(range=[0, 4]))
fig = _gpa_bar_figure(temp[-19:-14, 0], actual_gpa_prof, expected_gpa_prof, layout)
fig.show()

# --- Five ECE courses with the highest average actual GPA ---
temp = GPA_best_course(ece)
actual_gpa_courses = list(temp[:5, 1])
expected_gpa_courses = list(temp[:5, 2])
layout = go.Layout(yaxis=dict(range=[2.5, 4]))
fig = _gpa_bar_figure(temp[:5, 0], actual_gpa_courses, expected_gpa_courses, layout)
fig.show()

# --- Five ECE courses with the highest average time spent ---
temp = GPA_best_time(ece)
time_span_ece = list(temp[:5, 1])
layout = go.Layout(yaxis=dict(range=[12, 14]))
fig = go.Figure(data=[
    # This trace shows time spent, so it is no longer labelled 'Actual GPA'.
    go.Bar(name='Average Time Spent', x=list(temp[:5, 0]), y=time_span_ece)
], layout=layout)
fig.update_layout(barmode='group')
fig.show()
# --- ECE 101: actual vs. expected GPA per review, plotted against quarter ---
mask = np.where(ece.courses[:, 0] == 'ECE 101')[0]
ece101 = ece.courses[mask]
ece101_quarter = ece101[:, 3]
# The builtin float replaces the np.float alias removed from NumPy.
ece101_gpa = ece101[:, -1].astype(float)
ece101_gpa2 = ece101[:, -2].astype(float)
fig = go.Figure()
fig.add_trace(go.Scatter(x=ece101_quarter, y=ece101_gpa, name='ECE 101 actual GPA',
                         line=dict(color='blue', width=4)))
fig.add_trace(go.Scatter(x=ece101_quarter, y=ece101_gpa2, name='ECE 101 expected GPA',
                         line=dict(color='skyblue', width=4)))
fig.update_xaxes(title_text="quarter")
fig.update_yaxes(title_text="GPA")
# Show the trend chart before `fig` is rebound below; it used to be built and
# then discarded without ever being displayed.
fig.show()

# --- ECE 101: average actual GPA of each professor who has taught it ---
professor_set = set(ece101[:, 2])
res = []
for p in professor_set:
    target = ece101[ece101[:, 2] == p]
    res.append([p, avg_gpa(target[:, -1])])
res = np.array(res)
# `res` is a string array, so rank on a numeric copy of the GPA column.
index = np.argsort(res[:, 1].astype(float))[::-1]
res = res[index]
names = res[:, 0]
ece101_gpa = res[:, 1]
# Descending colour values so the highest-ranked bar gets the darkest blue.
cs = list(range(12, 0, -1))
layout = go.Layout(yaxis=dict(range=[1.5, 4]))
fig = go.Figure(data=[
    go.Bar(name='gpa', x=names, y=ece101_gpa, marker={'color': cs, 'colorscale': 'blues'})
], layout=layout)
fig.update_layout(barmode='overlay')
fig.update_layout(plot_bgcolor='whitesmoke')
fig.show()
def GPA_Matrix(department, years=('07', '08', '09', '10', '11', '12', '13',
                                  '14', '15', '16', '17', '18', '19')):
    """Build a course-by-year matrix of average actual GPA.

    Args:
        department: object whose ``courses`` array has the course id in
            column 0, the quarter in column 3 and the actual GPA in the last
            column.
        years: two-character year codes, compared against the quarter string
            with its first two characters removed.  Defaults to '07'..'19'.

    Returns:
        ``(course_ids, matrix)`` where ``course_ids`` is a 1-D array of the
        distinct course ids (sorted, so the order is reproducible) and
        ``matrix[i][j]`` is the mean actual GPA of ``course_ids[i]`` in
        ``years[j]``, or 0.0 when that course has no review in that year.
    """
    courses = department.courses
    course_ids = sorted(set(courses[:, 0]))
    matrix = []
    for course in course_ids:
        records = courses[courses[:, 0] == course]
        record_years = np.array([quarter[2:] for quarter in records[:, 3]])
        # The builtin float replaces the np.float alias removed from NumPy.
        record_gpas = records[:, -1].astype(float)
        row = []
        for year in years:
            in_year = record_gpas[record_years == year]
            row.append(in_year.sum() / in_year.size if in_year.size else 0.0)
        matrix.append(row)
    return np.array(course_ids), np.array(matrix)
# --- Heatmap of yearly average GPA for a window of ECE courses ---
courses, z = GPA_Matrix(ece)
# Rank courses by their summed yearly GPA (highest first) and apply the SAME
# permutation to the labels; previously only `z` was reordered, so the y-axis
# labels no longer described the rows they were drawn next to.
order = np.argsort(np.sum(z, axis=1))[::-1]
z = z[order]
courses = courses[order]
base = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]
# Plot ranks 20..29; rows and labels are sliced together so they stay aligned.
fig = go.Figure(data=go.Heatmap(
    z=z[20:30],
    x=base,
    y=courses[20:30],
    colorscale='blues'))
fig.update_layout(
    title='GPA ECE Courses',
    xaxis_nticks=20)
fig.show()
def _time_gpa_table(department, label):
    """Return (float array, column dict, DataFrame) of time spent and GPA.

    Column 6 of ``department.courses`` becomes 'time', the last column becomes
    'gpa', and every row is tagged with ``label``.
    """
    # The builtin float replaces the np.float alias removed from NumPy.
    values = department.courses[:, [6, -1]].astype(float)
    columns = {'time': values[:, 0], 'gpa': values[:, 1],
               'label': [label] * values.shape[0]}
    return values, columns, pd.DataFrame(columns)


# Time spent and actual GPA of every CAPE review, one table per department.
ece_time, dic_ece, df_ece = _time_gpa_table(ece, 'ece')
cse_time, dic_cse, df_cse = _time_gpa_table(cse, 'cse')
cogs_time, dic_cogs, df_cogs = _time_gpa_table(cogs, 'cogs')
math_time, dic_math, df_math = _time_gpa_table(math, 'math')

# Concatenate all the dataframes.
frames = [df_ece, df_cse, df_cogs, df_math]
df = pd.concat(frames)

# Stacked histogram of time spent, coloured by department.
fig = px.histogram(df,
                   title='Hours spent per week',
                   x='time',
                   color="label",
                   opacity=0.5,
                   color_discrete_sequence=['honeydew', 'skyblue', 'blue', 'darkblue'],
                   orientation='v')
fig.update_layout(barmode='stack')
fig.show()

# Time vs. GPA scatter for ECE alone.
fig = go.Figure(data=go.Scatter(x=df_ece['time'], y=df_ece['gpa'], mode='markers', name='ece'))
fig.update_xaxes(title_text="time")
fig.update_yaxes(title_text="GPA")
fig.show()

# Time vs. GPA scatter for all four departments on a 2x2 grid.
from plotly.subplots import make_subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("ECE Department", "MATH Department", "COGS Department", "CSE Department"))
fig.append_trace(go.Scatter(x=df_ece['time'], y=df_ece['gpa'], mode='markers', name='ece'), row=1, col=1)
fig.append_trace(go.Scatter(x=df_math['time'], y=df_math['gpa'], mode='markers', name='math'), row=1, col=2)
fig.append_trace(go.Scatter(x=df_cogs['time'], y=df_cogs['gpa'], mode='markers', name='cogs'), row=2, col=1)
fig.append_trace(go.Scatter(x=df_cse['time'], y=df_cse['gpa'], mode='markers', name='cse'), row=2, col=2)
# Without row/col these calls apply to every subplot, giving all four panels
# the same titles and ranges.
fig.update_xaxes(title_text="time", range=[0, 20])
fig.update_yaxes(title_text="GPA", range=[1, 4.5])
fig.show()
# --- Expected GPA and time spent for a hand-picked list of ML courses ---
ml_courses = ['ECE 175A', 'MATH 181D', 'CSE 152B', 'CSE 151A', 'CSE 184', 'CSE 158',
              'COGS 185', 'COGS 188', 'COGS 118A', 'COGS 118B', 'COGS 9', 'COGS 108']
# Resolve a course to its department by the full prefix before the space.
# The earlier three-character comparison could never match 'MATH', so MATH
# courses were silently looked up in the COGS data.
departments_by_prefix = {'ECE': ece, 'CSE': cse, 'MATH': math, 'COGS': cogs}
ml_courses_stat = []
for item in ml_courses:
    owner = departments_by_prefix[item.split()[0]]
    # Match on the course-id column only, not on any column of the table.
    temp = owner.courses[owner.courses[:, 0] == item]
    average_gpa = avg_gpa(temp[:, -2])
    average_time = avg_gpa(temp[:, 6])
    # An average of 0 means there was no usable data for the course; fall back
    # to the department median, mirroring how cleanData imputes missing GPAs.
    # This replaces the hard-coded overrides of rows 1 and 2.
    if average_gpa == 0:
        average_gpa = owner.gpa_median
    if average_time == 0:
        average_time = owner.time_median
    ml_courses_stat.append([item, average_gpa, average_time])
ml_courses_stat = np.array(ml_courses_stat)

# Bar chart ordered by GPA (ascending).  The table is a string array, so the
# ordering is computed on a numeric copy to avoid lexicographic sorting.
ml_courses_stat = ml_courses_stat[ml_courses_stat[:, 1].astype(float).argsort()]
names = ml_courses_stat[:, 0]
ml_courses_gpa = ml_courses_stat[:, 1]
cs = list(range(20, 0, -1))
layout = go.Layout(yaxis=dict(range=[2.5, 4]))
fig = go.FigureWidget(data=[go.Bar(x=names, y=ml_courses_gpa, marker={'color': cs, 'colorscale': 'blues'})])
fig.update_layout(barmode='overlay')
fig.update_layout(plot_bgcolor='whitesmoke')
fig.update_yaxes(title_text="GPA")
fig.show()

# Bar chart ordered by time spent (descending).  The labels are taken AFTER
# re-sorting so each bar carries the name of its own course.
ml_courses_stat = ml_courses_stat[ml_courses_stat[:, 2].astype(float).argsort()[::-1]]
names = ml_courses_stat[:, 0]
ml_courses_time = ml_courses_stat[:, 2]
cs = list(range(20, 0, -1))
fig = go.Figure(data=[
    go.Bar(name='time', x=names, y=ml_courses_time, marker={'color': cs, 'colorscale': 'blues'})
])
fig.update_layout(barmode='overlay')
fig.update_layout(plot_bgcolor='whitesmoke')
fig.update_yaxes(title_text="time")
fig.show()

# Time vs. GPA scatter with each point labelled by its course.
dic_ml = {'label': ml_courses_stat[:, 0],
          'gpa': ml_courses_stat[:, 1].astype(float),
          'time': ml_courses_stat[:, 2].astype(float)}
df_ml = pd.DataFrame(dic_ml)
fig = px.scatter(df_ml, x="time", y="gpa", text="label")
fig.update_traces(textposition='top center')
fig.update_xaxes(title_text="time")
fig.update_yaxes(title_text="GPA")
fig.update_layout(showlegend=True)
fig.show()
def TIME_by_professor(department):
    """Rank a department's professors by the average time spent on their courses.

    Returns:
        A string array with one row per entry of ``department.professors``:
        ``[professor, average of column 6]``, sorted by that average in
        descending order (most time first).  An empty ``(0, 2)`` array is
        returned when the department has no professors.
    """
    courses = department.courses
    rows = []
    for professor in department.professors:
        reviews = courses[courses[:, 2] == professor]
        # avg_gpa is a generic "mean of the non-zero entries" helper.
        rows.append([professor, avg_gpa(reviews[:, 6])])
    if not rows:
        return np.empty((0, 2), dtype=str)
    table = np.array(rows)
    # The table is a string array, so sort on a numeric copy of the column.
    order = np.argsort(table[:, 1].astype(float))[::-1]
    return table[order]
# --- ECE professors with the most and the least average time spent ---
temp = TIME_by_professor(ece)
avg_time = list(temp[:5, 1])
avg_worst = list(temp[-5:, 1])
layout = go.Layout(yaxis=dict(range=[0, 20]))
fig = go.Figure(data=[
    go.Bar(name='Most Time Spent', x=list(temp[:5, 0]), y=avg_time,
           marker_color=['blue'] * 5, opacity=0.5, text=avg_time),
    # Each bar is annotated with its own value; this trace previously reused
    # the "most time" values as its text labels.
    go.Bar(name='Least Time Spent', x=list(temp[-5:, 0]), y=avg_worst,
           marker_color=['skyblue'] * 5, opacity=0.5, text=avg_worst),
], layout=layout)
fig.update_layout(barmode='group')
fig.show()